library(VA,lib.loc="nfs/home/D/dhopkins/.R/library")
library(e1071)

### iterations in estimation algorithm
nn <- 150

### observations in synthetic data
MMM <- 1000
    
symm <- c(5,6,7,8,9,10)

### vector of possible subsets
subb <- c(300,300,300,300,1)

n.symp <- 6

#### store results of iterations
resmat <- matrix(NA,length(symm),length(subb))
smatni <-  smat <- tmat2 <-resmat2 <- matrix(NA,nn,7)

#### for cross-validation
#### test for best svm
ss <- seq(from=0.000001,to=0.1,length=15)
coeff <- seq(from=-1,to=1,by=.025)
mm <- length(coeff)

rmat <- array(NA,dim=c(length(ss),mm,nn))
for(ii in 1:nn){
  sym <- symm[ii]
  sub <- subb[1]
    
####number of synthetic words/symptoms
  num.of.words <- 10

####number of categories in the dependent variable
  n.cat <- 7

####number of observations for synthetic data set
  nobs <- MMM

####P(D_1)
  probs2 <- c(.14,.14,.14,.14,.15,.14,.15)

####P(D_2)
  probs1 <- c(.05,.18,.09,.18,.09,.17,.24)

####observed categories | P(D_1)
  cat1 <- sample(1:n.cat,nobs,prob=probs1,replace=T)

###create P(S|D)
  sprob <- matrix(0,n.cat,num.of.words)
  for(i in 1:n.cat){
    sprob[i,] <- sample((1:10)/10-.05,num.of.words,replace=T)
  }

###P(S) when P(D)==P(D_1)
  symp1 <- matrix(0,nobs,num.of.words)
  for(i in 1:nobs){
    symp1[i,] <- rbinom(num.of.words,size=1,prob=sprob[cat1[i],])
  }
###observed categories | P(D_2)
  cat2 <- sample(1:n.cat,nobs,prob=probs2,replace=T)
###data set when P(D)==P(D_1)
  symp2 <- matrix(0,nobs,num.of.words)
  for(i in 1:nobs){
    symp2[i,] <- rbinom(num.of.words,size=1,prob=sprob[cat2[i],])
  }
####
  train <- as.data.frame(cbind(symp2,cat2))
  test <- as.data.frame(cbind(symp1,cat1))
  cn <- c()
  for(i in 1:num.of.words){
    cn <- c(cn,paste("WORD",i,sep=""))
  }
  colnames(train) <- colnames(test) <- c(cn,"jointcode")

  sttxt <- cn[1]
  fntxt <- cn[i]

  
  ### run VA
  txt <- paste("vout1<-va(cbind(",sttxt,"+...+",fntxt,")~jointcode,data=list(train,test),nsymp=n.symp,n.subset=300,printit=F,print.reg.size=F)",sep="")
  eval(parse(text=txt))

  ### run SVM
  train$jointcode2 <- as.factor(train$jointcode)
  train$jointcode <- NULL

  ### add pairwise interactions
  optmat <- combn(num.of.words,2)
  opts <- dim(optmat)[2]
  for(zzz in 1:opts){
    o1 <- optmat[1,zzz]
    o2 <- optmat[2,zzz]
    txti <- paste("train$interact",zzz,"<- train$WORD",o1,"*train$WORD",o2,sep="")
    eval(parse(text=txti))
  txti <- paste("test$interact",zzz,"<- test$WORD",o1,"*test$WORD",o2,sep="")
    eval(parse(text=txti))
  }
    
  sout1 <- svm(jointcode2~.,data=train,gamma=ss[5],type="C-classification",coef0=coeff[51],kernel="polynomial",cross=10)
 
  ### final output
  p1 <- predict(sout1,newdata=test)
  smat[ii,] <- table(p1)/sum(table(p1)) 
  resmat2[ii,] <- vout1$est.CSMF
  tmat2[ii,] <- vout1$true.CSMF

  ### no interaction

  sout2 <- svm(jointcode2~WORD1+WORD2+WORD3+WORD4+WORD5+WORD6+WORD7+WORD8+WORD9+WORD10,data=train,gamma=ss[5],type="C-classification",coef0=coeff[51],kernel="polynomial",cross=10)
  p2 <- predict(sout2,newdata=test)
  smatni[ii,] <- table(p2)/sum(table(p2))  

}


mean(apply(abs(smat-tmat2),1,mean))
mean(apply(abs(resmat2-tmat2),1,mean))


#pdf("~/projects-space/readme/words/figs/optimalsvm.pdf")
#wireframe(ttmat,zlab="Percent \n Correct",zlim=c(60,80),xlab="Gamma",ylab="Coef",shade=T)
#dev.off()

### determine optimal parameters
#aa <- c()
#for(i in 1:15){
#  aa <- c(aa,which(rmat[,,i]==max(rmat[,,i])))
#}
#whichcol <- 1+aa/15



#### GRAPH
xx <- "Actual P(D)"
yy <- "Estimated P(D)"


#txtt<-paste("~/projects-space/readme/wds/figs/syntheticFP040108.pdf",sep="")

txtt<-paste("~/poliblog/replication/syntheticFP041408B.pdf",sep="")
pdf(txtt)
par(pty="s")
plot(apply(tmat2,2,mean),apply(resmat2,2,mean),xlim=c(0,.3),ylim=c(0,.3),xlab=xx,ylab=yy,cex.lab=1.8,cex=2)
par(new=T)
#plot(apply(tmat2,2,mean),apply(smat,2,mean),xlim=c(0,.3),ylim=c(0,.3),xlab=xx,ylab=yy,cex.lab=1.8,cex=1,pch=16)
#par(new=T)
#plot(apply(tmat2,2,mean),apply(smatni,2,mean),xlim=c(0,.3),ylim=c(0,.3),xlab=xx,ylab=yy,cex.lab=1.8,cex=1.5,pch=16)

abline(c(0,0),c(1,1))
dev.off()


#  }



library(combn)
i <- 1
rs <- sample(1:10,6,replace=F)
sprob2<-sprob[,rs]

###create matrix of profiles
mm <- matrix(0,2^n.symp,n.symp)
k  <-  2
vec1 <- rep(1,n.symp)
for(i in 1:n.symp){
  cc <- combn(6,i)
  kk <- dim(cc)[2] 
  for(j in 1:kk){
    mm[k,cc[,j]] <- 1
    k <- k+1
  }
}

probmat1 <- matrix(NA,2^n.symp,n.cat)
for(zz in 1:n.cat){
  pp<-sprob2[zz,]
  pp2 <- 1-pp
  for(yy in 1:2^n.symp){
    probmat1[yy,zz] <- prod(c(pp[mm[yy,]==1],pp2[mm[yy,]==0]))
  }
}




pdf("/nfs/fs1/projects/dhopkins/readme/wds/figs/changedistFPSVM2.pdf",height=10,width=10)
par(mfcol=c(1,2))
par(pty="s")
cc <- 1.5
cl <- 1.2
cm <- 1.5

par(mfcol=c(1,2))
matplot(c(probs2),c(probs1),
     xlab=expression(paste(P^h,"(D)")),
     ylab=expression(paste("P(D)")),
     cex=cc,
     cex.main=cm,
        cex.lab=cl,
     main="Differences in Document \n Category Frequencies",
        xlim=c(0,.4),ylim=c(0,.4),
        pch=1)
abline(a=0,b=1)

##analytically estimated
ps1 <- probmat1%*%probs1
ps2 <- probmat1%*%probs2

matplot(c(ps2),c(ps1),
        cex=cc,
        xlab=expression(paste(P^h,"(S)")),
        ylab=expression(paste("P(S)")),
        main="Differences in Word \n Profile Frequencies",
            ylim=c(0.002,.075),xlim=c(0.0025,.075),
        cex.lab=cl,
        cex.main=cm,
        log="xy",
        pch=1)
abline(a=0,b=1)
dev.off()

